library(tidyverse)
library(cowplot)
library(GGally)
library(heatmaply)
library(sva)
library(limma)
library(broom)
library(ggridges)
# Cells #
fa.cell.neg.raw <- read_csv("./data/abundances/fa_1and2_cells_target_negmode.csv")
Parsed with column specification:
cols(
.default = col_double(),
Samples = col_character(),
Mode = col_character(),
Type = col_character(),
Group = col_character(),
Treatment = col_character(),
Plate = col_character(),
Experiment = col_character()
)
See spec(...) for full column specifications.
fa.cell.pos.raw <- read_csv("./data/abundances/fa_1and2_cells_target_posmode.csv")
Parsed with column specification:
cols(
.default = col_double(),
Samples = col_character(),
Mode = col_character(),
Type = col_character(),
Group = col_character(),
Treatment = col_character(),
Plate = col_character(),
Experiment = col_character()
)
See spec(...) for full column specifications.
# Media #
fa.med.neg.raw <- read_csv("./data/abundances/fa_1and2_media_target_negmode.csv")
Parsed with column specification:
cols(
.default = col_double(),
Samples = col_character(),
Mode = col_character(),
Type = col_character(),
Group = col_character(),
Treatment = col_character(),
Plate = col_character(),
Experiment = col_character()
)
See spec(...) for full column specifications.
fa.med.pos.raw <- read_csv("./data/abundances/fa_1and2_media_target_posmode.csv")
Parsed with column specification:
cols(
.default = col_double(),
Samples = col_character(),
Mode = col_character(),
Type = col_character(),
Group = col_character(),
Treatment = col_character(),
Plate = col_character(),
Experiment = col_character()
)
See spec(...) for full column specifications.
# Cells #
fa.cell.neg.compound.info <- read_csv("./data/compound_info/fa_1and2_cells_target_negmode_cmpnd_info.csv")
Parsed with column specification:
cols(
compound_short = col_character(),
compound_full = col_character(),
formula = col_character(),
mass = col_double(),
rt = col_double(),
cas_id = col_character()
)
fa.cell.pos.compound.info <- read_csv("./data/compound_info/fa_1and2_cells_target_posmode_cmpnd_info.csv")
Parsed with column specification:
cols(
compound_short = col_character(),
compound_full = col_character(),
formula = col_character(),
mass = col_double(),
rt = col_double(),
cas_id = col_character()
)
# Media #
fa.med.neg.compound.info <- read_csv("./data/compound_info/fa_1and2_media_target_negmode_cmpnd_info.csv")
Parsed with column specification:
cols(
compound_short = col_character(),
compound_full = col_character(),
formula = col_character(),
mass = col_double(),
rt = col_double(),
cas_id = col_character()
)
fa.med.pos.compound.info <- read_csv("./data/compound_info/fa_1and2_media_target_posmode_cmpnd_info.csv")
Parsed with column specification:
cols(
compound_short = col_character(),
compound_full = col_character(),
formula = col_character(),
mass = col_double(),
rt = col_double(),
cas_id = col_character()
)
# Kegg/other ID reference #
cmpnd.id.db <- read_csv("./data/other/anp_db_compound_kegg.csv")
Parsed with column specification:
cols(
compound_full = col_character(),
cas_id = col_character(),
HMDB = col_character(),
Metlin = col_character(),
KEGG = col_character()
)
MissingPerSamplePlot <- function(raw.data, start.string) {
# Counts the number of missing/NA values per sample and
# percent compounds missing out of total number of compounds per sample
# Then passes the result into a vertical bar plot, where each
# bar represents a single sample and the size of the bar
# is the % of compounds missing
counted.na <- raw.data %>%
select(starts_with(start.string)) %>%
mutate(
count.na = apply(., 1, function(x) sum(is.na(x))),
percent.na = (count.na / ncol(raw.data %>% select(starts_with(start.string)))) * 100
) %>%
dplyr::select(count.na, percent.na) %>%
bind_cols(
raw.data %>%
select(Samples, Group)
) %>%
arrange(percent.na) %>%
mutate(f.order = factor(Samples, levels = Samples))
counted.na %>%
ggplot(aes(x = f.order, y = percent.na, fill = Group)) +
geom_bar(stat = "identity") +
geom_hline(yintercept = 20, color = "gray", size = 1, alpha = 0.8) +
coord_flip()+
xlab("Samples") +
ylab("Percent missing values in sample") +
theme(axis.text.y = element_text(size = 6))
}
MissingPerCompound <- function(raw.data, start.string){
# Function to count in how many experimental samples each compound is missing
# Also calculates the % missing:
# (# NA per compound across all experimental samples) * 100 / (tot num of samples)
raw.data %>%
filter(Group == "sample") %>%
select(Samples, starts_with(start.string)) %>%
gather(key = "Compound", value = "Abundance", -Samples) %>%
group_by(Compound) %>%
summarise(
na_count = sum(is.na(Abundance)),
n_samples = n(),
percent_na = (na_count * 100 / n_samples)
) %>%
filter(na_count > 0) %>%
arrange(desc(percent_na))
}
ReplaceNAwMinLogTransformSingle <- function(raw.dataframe, start.prefix) {
# Function to replace any NAs in each column with the minimum for that column,
# separately for each sample type.
# NA in negative control samples are replaced by 2.
# Then data is log2 transformed
smpls <- raw.dataframe %>%
filter(Group == "sample") %>%
dplyr::select(starts_with(start.prefix))
smpls.min <- lapply(smpls, min, na.rm = TRUE)
# replace the missing values in the real samples with the minimum of the samples
# then take the log
smpls.noNA <- raw.dataframe %>%
filter(Group == "sample") %>%
dplyr::select(Samples:Experiment) %>%
bind_cols(
smpls %>%
replace_na(replace = smpls.min) %>%
log2()
)
# QC
QC <- raw.dataframe %>%
filter(Group == "QC") %>%
dplyr::select(starts_with(start.prefix))
QC.min <- lapply(QC, min, na.rm = TRUE)
# replace the missing values in the QC with the minimum of the QC
# then take the log
QC.noNA <- raw.dataframe %>%
filter(Group == "QC") %>%
dplyr::select(Samples:Experiment) %>%
bind_cols(
QC %>%
replace_na(replace = QC.min) %>%
log2()
)
# replace the missing values in solv and empty samples with 2 - for PCA analysis
# then take the log
other.min <- setNames(
as.list(
rep(2, ncol(
raw.dataframe %>%
dplyr::select(starts_with(start.prefix))))
),
colnames(raw.dataframe %>% dplyr::select(starts_with(start.prefix)))
)
other.num.log <- raw.dataframe %>%
filter(Group != "sample" & Group != "QC") %>%
dplyr::select(Samples:Experiment) %>%
bind_cols(
raw.dataframe %>%
filter(Group != "sample" & Group != "QC") %>%
dplyr::select(starts_with(start.prefix)) %>%
replace_na(replace = other.min) %>%
log2()
)
# combine them together back into one data frame
all.noNA <- smpls.noNA %>%
bind_rows(QC.noNA) %>%
bind_rows(other.num.log)
}
HeatmapPrepAlt <- function(raw.data, start.prefix){
# function for preparing dara for heatmap viz
x <- raw.data %>%
select(starts_with(start.prefix)) %>%
scale(center = TRUE, scale = TRUE) %>%
as.matrix()
row.names(x) <- raw.data$Samples
return(x)
}
Q: What are the distributions of compound masses and retention times?
# bind all 4 compound info df into 1
full.fa.cmpnd <- fa.cell.neg.compound.info %>%
mutate(Set = "Cells / Neg") %>%
bind_rows(fa.cell.pos.compound.info %>% mutate(Set = "Cells / Pos")) %>%
bind_rows(fa.med.neg.compound.info %>% mutate(Set = "Media / Neg")) %>%
bind_rows(fa.med.pos.compound.info %>% mutate(Set = "Media / Pos"))
full.fa.cmpnd %>%
ggplot(aes(x = rt, y = mass)) +
geom_point(size = 3, alpha = 0.3) +
xlab("Retention Time (min)") +
ylab("Mass (Da)") +
ggtitle("Mass v RT\nFA Only Exp") +
ylim(0, 1000)
full.fa.cmpnd %>%
ggplot(aes(x = rt, y = mass, color = Set)) +
geom_point(size = 3, alpha = 0.5) +
xlab("Retnetion Time (min)") +
ylab("Mass (Da)") +
ggtitle("Mass v RT\nFA Only Exp") +
facet_wrap(~ Set) +
ylim(0, 1000)
Q: Which compounds were found in one or more of the data types?
### FA cell join ###
fa.cell.cmpnd.join <- fa.cell.neg.compound.info %>%
inner_join(fa.cell.pos.compound.info, by = "cas_id", suffix = c(".c.n", ".c.p")) %>%
select(
contains("cas_id"), contains("short"),
contains("full"), starts_with("formula"),
starts_with("mass"), starts_with("rt")
)
# compound names - found in pos and neg mode / cells
print(fa.cell.cmpnd.join$compound_full.c.n)
[1] "Glycine"
[2] "Alanine"
[3] "Sarcosine"
[4] "2-Aminobutyric Acid"
[5] "GABA"
[6] "BAIBA"
[7] "Serine"
[8] "Hypotaurine"
[9] "Uracil"
[10] "Proline"
[11] "Valine"
[12] "Threonine"
[13] "Cysteine"
[14] "Taurine"
[15] "4-Oxoproline"
[16] "trans-4-Hydroxyproline"
[17] "Creatine"
[18] "Isoleucine"
[19] "Leucine"
[20] "Asparagine"
[21] "Aspartic Acid"
[22] "Adenine"
[23] "Hypoxanthine"
[24] "Urocanic Acid"
[25] "O-Phosphoethanolamine"
[26] "Glutamine"
[27] "Lysine"
[28] "Glutamic Acid"
[29] "Methionine"
[30] "2-Phenylglycine"
[31] "Xanthine"
[32] "Ribitol"
[33] "3-Sulfinoalanine"
[34] "3-Hydroxyanthranilic Acid"
[35] "Histidine"
[36] "Orotic Acid"
[37] "Allantoin"
[38] "N-Methylglutamic Acid"
[39] "Phenylalanine"
[40] "Pyridoxine"
[41] "DHAP"
[42] "G3P"
[43] "Glycerol 1-Phosphate / Glycerol 3-Phosphate"
[44] "Arginine"
[45] "N-Carbamoyl-L-aspartic Acid"
[46] "Tyrosine"
[47] "D-Galactitol"
[48] "N-alpha-Acetylglutamine"
[49] "Glucuronic Acid"
[50] "Tryptophan"
[51] "Phosphocreatine"
[52] "Glycerylphosphorylethanolamine"
[53] "O-Succinylhomoserine"
[54] "Pantothenic Acid"
[55] "GlcNAc"
[56] "Cystathionine"
[57] "Deoxycytidine"
[58] "Cytidine"
[59] "Uridine"
[60] "Palmitoleic Acid"
[61] "Glycerol 1-phosphoserine"
[62] "D-Glucose 6-phosphate"
[63] "Thiamine"
[64] "Adenosine"
[65] "Inosine"
[66] "10Z-Heptadecenoic Acid"
[67] "Myoinositol-methyl-phosphate"
[68] "Oleic Acid"
[69] "Guanosine"
[70] "Xanthosine"
[71] "1-Monomyristin"
[72] "2,3-cyclic CMP"
[73] "Glutathione"
[74] "Neu5Ac"
[75] "CMP"
[76] "UMP"
[77] "3-Phosphoglyceroinositol"
[78] "AMP"
[79] "GMP"
[80] "Succinoadenosine"
[81] "SAH"
[82] "CDP"
[83] "ADP"
[84] "GDP"
[85] "LysoPE(18:0)"
[86] "CTP"
[87] "UTP"
[88] "CDP-Choline"
[89] "ATP"
[90] "GTP"
[91] "GDP-Glucose"
[92] "UDP-GalNAc"
[93] "UDP-GlcNAc"
[94] "GSSG"
[95] "NAD"
[96] "NADH"
[97] "NADP"
[98] "CoA"
[99] "PC(36:4)"
[100] "PC(36:4)"
# percent of cell / neg compounds found in cell / pos
round(nrow(fa.cell.cmpnd.join) * 100 / nrow(fa.cell.neg.compound.info), 1)
[1] 47.4
# percent of cell / neg compounds found in cell / pos
round(nrow(fa.cell.cmpnd.join) * 100 / nrow(fa.cell.pos.compound.info), 1)
[1] 66.2
### FA media join ###
fa.med.cmpnd.join <- fa.med.neg.compound.info %>%
inner_join(fa.med.pos.compound.info, by = "cas_id", suffix = c(".m.n", ".m.p")) %>%
select(
contains("cas_id"), contains("short"),
contains("full"), starts_with("formula"),
starts_with("mass"), starts_with("rt")
)
# compound names - found in pos and neg mode / media
print(fa.med.cmpnd.join$compound_full.m.n)
[1] "Glycine" "Alanine"
[3] "Serine" "Uracil"
[5] "Creatinine" "Proline"
[7] "Valine" "Threonine"
[9] "Taurine" "4-Oxoproline"
[11] "Isoleucine" "Leucine"
[13] "Asparagine" "Hypoxanthine"
[15] "Glutamine" "Glutamic Acid"
[17] "Methionine" "Xanthine"
[19] "Allantoin" "Phenylalanine"
[21] "Uric Acid" "Pyridoxine"
[23] "Tyrosine" "D-Galactitol"
[25] "Phenylacetylglycine" "Cysteine-S-sulfonic Acid"
[27] "Pantothenic Acid" "Uridine"
[29] "Folic Acid"
# percent of media / neg compounds found in media / pos
round(nrow(fa.med.cmpnd.join) * 100 / nrow(fa.med.neg.compound.info), 1)
[1] 46.8
# percent of media / pos compounds found in media / neg
round(nrow(fa.med.cmpnd.join) * 100 / nrow(fa.med.pos.compound.info), 1)
[1] 43.3
### FA all sets join ###
fa.all.cmpnd.join <- fa.cell.cmpnd.join %>%
inner_join(fa.med.cmpnd.join, by = "cas_id") %>%
select(
contains("cas_id"), contains("short"),
contains("full"), starts_with("formula"),
starts_with("mass"), starts_with("rt")
)
nrow(fa.all.cmpnd.join)
[1] 24
# check for any mass issues between all 4 modes
fa.all.cmpnd.join %>%
select(contains("mass")) %>%
ggpairs()
# RT issues?
fa.all.cmpnd.join %>%
select(starts_with("rt")) %>%
ggpairs()
Q: Do any of the samples have greater than 20% missing (NA) compound abundances, out of all of the features in the dataset?
MissingPerSamplePlot(fa.cell.neg.raw, "FAnC") +
ggtitle("Missing Per Sample\nFA-only / Cells / Neg Mode")
MissingPerSamplePlot(fa.cell.pos.raw, "FApC") +
ggtitle("Missing Per Sample\nFA-only / Cells / Pos Mode")
MissingPerSamplePlot(fa.med.neg.raw, "FAnM") +
ggtitle("Missing Per Sample\nFA-only / Media / Neg Mode")
MissingPerSamplePlot(fa.med.pos.raw, "FApM") +
ggtitle("Missing Per Sample\nFA-only / Media / Pos Mode")
(fa.cell.neg.cmpnd.to.excl <- MissingPerCompound(fa.cell.neg.raw, "FAnC") %>%
filter(percent_na > 20))
# A tibble: 1 x 4
Compound na_count n_samples percent_na
<chr> <int> <int> <dbl>
1 FAnC52 7 24 29.2
(fa.cell.pos.cmpnd.to.excl <- MissingPerCompound(fa.cell.pos.raw, "FApC") %>%
filter(percent_na > 20))
# A tibble: 1 x 4
Compound na_count n_samples percent_na
<chr> <int> <int> <dbl>
1 FApC23 8 24 33.3
MissingPerCompound(fa.med.neg.raw, "FAnM") %>%
filter(percent_na > 20)
# A tibble: 0 x 4
# ... with 4 variables: Compound <chr>, na_count <int>, n_samples <int>,
# percent_na <dbl>
MissingPerCompound(fa.med.pos.raw, "FApM") %>%
filter(percent_na > 20)
# A tibble: 0 x 4
# ... with 4 variables: Compound <chr>, na_count <int>, n_samples <int>,
# percent_na <dbl>
fa.cell.neg.raw.grp.mean <- fa.cell.neg.raw %>%
group_by(Group) %>%
summarize_at(vars(matches("FAnC")), mean, na.rm = TRUE) %>%
gather(key = "Compound", value = "Grp_mean_abun", -Group)
fa.cell.neg.raw.grp.mean %>%
ggplot(aes(log2(Grp_mean_abun), color = Group)) +
geom_density(size = 2, alpha = 0.8) +
ggtitle("Distribution of compound means\nFA-only / Cells / Negative Mode\nGrouped by sample type")
fa.cell.neg.raw.grp.mean.order <- fa.cell.neg.raw.grp.mean %>%
filter(Group == "sample") %>%
arrange(Grp_mean_abun)
fa.cell.neg.raw %>%
select(Samples, Group, starts_with("FAnC")) %>%
gather("Compound", value = "Abundance", -c(Samples, Group)) %>%
mutate(Cmpnd_sort = factor(Compound, levels = fa.cell.neg.raw.grp.mean.order$Compound)) %>%
ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) +
geom_line(alpha = 0.1, size = 1) +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
xlab("Compound") +
geom_line(
data = fa.cell.neg.raw.grp.mean %>%
mutate(Cmpnd_sort = factor(Compound, levels = fa.cell.neg.raw.grp.mean.order$Compound)),
aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
size = 0.5
) +
ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nFA-only / Cells / Negative Mode")
fa.cell.neg.raw.grp.mean %>%
mutate(Cmpnd_sort = factor(Compound, levels = fa.cell.neg.raw.grp.mean.order$Compound)) %>%
ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
geom_point(size = 1, alpha = 0.8) +
geom_line(alpha = 0.8) +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
xlab("Compound") +
ylab("log2(Sample Type Mean)") +
ggtitle("Profile Plot of compound means by sample type only\nFA-only / Cells / Negative Mode")
fa.cell.neg.raw.grp.diff <- fa.cell.neg.raw.grp.mean %>%
spread(Group, Grp_mean_abun) %>%
mutate(smpl_solv_diff = sample / solv)
fa.cell.neg.raw.grp.diff %>%
ggplot(aes(log2(smpl_solv_diff))) +
geom_histogram(bins = 50)
# include compounds with FC > 2.5 or FC is NA (indication of NA in solv)
fa.cell.neg.cmpnd.to.incl <- fa.cell.neg.raw.grp.diff %>%
filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff)) %>%
filter(!(Compound %in% fa.cell.neg.cmpnd.to.excl$Compound))
# how many compound were there in the raw dataset?
nrow(fa.cell.neg.raw.grp.diff)
[1] 211
# how many compounds are retained for further analysis?
nrow(fa.cell.neg.cmpnd.to.incl)
[1] 204
fa.cell.pos.raw.grp.mean <- fa.cell.pos.raw %>%
group_by(Group) %>%
summarize_at(vars(matches("FApC")), mean, na.rm = TRUE) %>%
gather(key = "Compound", value = "Grp_mean_abun", -Group)
fa.cell.pos.raw.grp.mean %>%
ggplot(aes(log2(Grp_mean_abun), color = Group)) +
geom_density(size = 2, alpha = 0.8) +
ggtitle("Distribution of compound means\nFA-only / Cells / Positive Mode\nGrouped by sample type")
fa.cell.pos.raw.grp.mean.order <- fa.cell.pos.raw.grp.mean %>%
filter(Group == "sample") %>%
arrange(Grp_mean_abun)
fa.cell.pos.raw %>%
select(Samples, Group, starts_with("FApC")) %>%
gather("Compound", value = "Abundance", -c(Samples, Group)) %>%
mutate(Cmpnd_sort = factor(Compound, levels = fa.cell.pos.raw.grp.mean.order$Compound)) %>%
ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) +
geom_line(alpha = 0.1, size = 1) +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
xlab("Compound") +
geom_line(
data = fa.cell.pos.raw.grp.mean %>%
mutate(Cmpnd_sort = factor(Compound, levels = fa.cell.pos.raw.grp.mean.order$Compound)),
aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
size = 0.5
) +
ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nFA-only / Cells / Positive Mode")
fa.cell.pos.raw.grp.mean %>%
mutate(Cmpnd_sort = factor(Compound, levels = fa.cell.pos.raw.grp.mean.order$Compound)) %>%
ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
geom_point(size = 1, alpha = 0.8) +
geom_line(alpha = 0.8) +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
xlab("Compound") +
ylab("log2(Sample Type Mean)") +
ggtitle("Profile Plot of compound means by sample type only\nFA-only / Cells / Positive Mode")
fa.cell.pos.raw.grp.diff <- fa.cell.pos.raw.grp.mean %>%
spread(Group, Grp_mean_abun) %>%
mutate(smpl_solv_diff = sample / solv)
fa.cell.pos.raw.grp.diff %>%
ggplot(aes(log2(smpl_solv_diff))) +
geom_histogram(bins = 50)
fa.cell.pos.cmpnd.to.incl <- fa.cell.pos.raw.grp.diff %>%
filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff)) %>%
filter(!(Compound %in% fa.cell.pos.cmpnd.to.excl$Compound))
# compounds in the original set
nrow(fa.cell.pos.raw.grp.diff)
[1] 151
# compounds left in analysis
nrow(fa.cell.pos.cmpnd.to.incl)
[1] 145
fa.med.neg.raw.grp.mean <- fa.med.neg.raw %>%
group_by(Group) %>%
summarize_at(vars(matches("FAnM")), mean, na.rm = TRUE) %>%
gather(key = "Compound", value = "Grp_mean_abun", -Group)
fa.med.neg.raw.grp.mean %>%
ggplot(aes(log2(Grp_mean_abun), color = Group)) +
geom_density(size = 2, alpha = 0.8) +
ggtitle("Distribution of compound means\nFA-only / Media / Negative Mode\nGrouped by sample type")
fa.med.neg.raw.grp.mean.order <- fa.med.neg.raw.grp.mean %>%
filter(Group == "sample") %>%
arrange(Grp_mean_abun)
fa.med.neg.raw %>%
select(Samples, Group, starts_with("FAnM")) %>%
gather("Compound", value = "Abundance", -c(Samples, Group)) %>%
mutate(Cmpnd_sort = factor(Compound, levels = fa.med.neg.raw.grp.mean.order$Compound)) %>%
ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) +
geom_line(alpha = 0.1, size = 2) +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
xlab("Compound") +
geom_line(
data = fa.med.neg.raw.grp.mean %>%
mutate(Cmpnd_sort = factor(Compound, levels = fa.med.neg.raw.grp.mean.order$Compound)),
aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
size = 0.5
) +
ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nFA-only / Media / Negative Mode")
fa.med.neg.raw.grp.mean %>%
mutate(Cmpnd_sort = factor(Compound, levels = fa.med.neg.raw.grp.mean.order$Compound)) %>%
ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
geom_point(size = 1, alpha = 0.8) +
geom_line(alpha = 0.8) +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
xlab("Compound") +
ylab("log2(Sample Type Mean)") +
ggtitle("Profile Plot of compound means by sample type only\nFA-only / Media / Negative Mode")
fa.med.neg.raw.grp.diff <- fa.med.neg.raw.grp.mean %>%
spread(Group, Grp_mean_abun) %>%
mutate(smpl_solv_diff = sample / solv)
fa.med.neg.raw.grp.diff %>%
ggplot(aes(log2(smpl_solv_diff))) +
geom_histogram(bins = 25)
fa.med.neg.cmpnd.to.incl <- fa.med.neg.raw.grp.diff %>%
filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff))
# original number
nrow(fa.med.neg.raw.grp.diff)
[1] 62
# number of metabolites after filtering
nrow(fa.med.neg.cmpnd.to.incl)
[1] 48
fa.med.pos.raw.grp.mean <- fa.med.pos.raw %>%
group_by(Group) %>%
summarize_at(vars(matches("FApM")), mean, na.rm = TRUE) %>%
gather(key = "Compound", value = "Grp_mean_abun", -Group)
fa.med.pos.raw.grp.mean %>%
ggplot(aes(log2(Grp_mean_abun), color = Group)) +
geom_density(size = 2, alpha = 0.8) +
ggtitle("Distribution of compound means\nFA-only / Media / Positive Mode\nGrouped by sample type")
fa.med.pos.raw.grp.mean.order <- fa.med.pos.raw.grp.mean %>%
filter(Group == "sample") %>%
arrange(Grp_mean_abun)
fa.med.pos.raw %>%
select(Samples, Group, starts_with("FApM")) %>%
gather("Compound", value = "Abundance", -c(Samples, Group)) %>%
mutate(Cmpnd_sort = factor(Compound, levels = fa.med.pos.raw.grp.mean.order$Compound)) %>%
ggplot(aes(Cmpnd_sort, log2(Abundance), color = Group, group = Samples)) +
geom_line(alpha = 0.1, size = 2) +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
xlab("Compound") +
geom_line(
data = fa.med.pos.raw.grp.mean %>%
mutate(Cmpnd_sort = factor(Compound, levels = fa.med.pos.raw.grp.mean.order$Compound)),
aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group),
size = 0.5
) +
ggtitle("Profile Plot of all compound abundances\nWith average per sample type overlaid\nFA-only / Media / Positive Mode")
fa.med.pos.raw.grp.mean %>%
mutate(Cmpnd_sort = factor(Compound, levels = fa.med.pos.raw.grp.mean.order$Compound)) %>%
ggplot(aes(Cmpnd_sort, log2(Grp_mean_abun), color = Group, group = Group)) +
geom_point(size = 1, alpha = 0.8) +
geom_line(alpha = 0.8) +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
xlab("Compound") +
ylab("log2(Sample Type Mean)") +
ggtitle("Profile Plot of compound means by sample type only\nFA-only / Media / Positive Mode")
fa.med.pos.raw.grp.diff <- fa.med.pos.raw.grp.mean %>%
spread(Group, Grp_mean_abun) %>%
mutate(smpl_solv_diff = sample / solv)
fa.med.pos.raw.grp.diff %>%
ggplot(aes(log2(smpl_solv_diff))) +
geom_histogram(bins = 25)
fa.med.pos.cmpnd.to.incl <- fa.med.pos.raw.grp.diff %>%
filter(smpl_solv_diff > 2.5 | is.na(smpl_solv_diff))
# original number of metabolites
nrow(fa.med.pos.raw.grp.diff)
[1] 67
# number left after filtering
nrow(fa.med.pos.cmpnd.to.incl)
[1] 47
fa.cell.neg.noNA <- fa.cell.neg.raw %>%
select(Samples:Experiment, one_of(fa.cell.neg.cmpnd.to.incl$Compound)) %>%
ReplaceNAwMinLogTransformSingle("FAnC")
fa.cell.pos.noNA <- fa.cell.pos.raw %>%
select(Samples:Experiment, one_of(fa.cell.pos.cmpnd.to.incl$Compound)) %>%
ReplaceNAwMinLogTransformSingle("FApC")
fa.med.neg.noNA <- fa.med.neg.raw %>%
select(Samples:Experiment, one_of(fa.med.neg.cmpnd.to.incl$Compound)) %>%
ReplaceNAwMinLogTransformSingle("FAnM")
fa.med.pos.noNA <- fa.med.pos.raw %>%
select(Samples:Experiment, one_of(fa.med.pos.cmpnd.to.incl$Compound)) %>%
ReplaceNAwMinLogTransformSingle("FApM")
fa.cell.neg.noNA.gathered <- fa.cell.neg.noNA %>%
gather(
key = "Metabolite", "Abundance",
which(colnames(fa.cell.neg.noNA) == "FAnC1"):ncol(fa.cell.neg.noNA)
)
fa.cell.neg.noNA.gathered %>%
ggplot(aes(Samples, Abundance, fill = Group)) +
geom_boxplot() +
geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
theme(axis.text.x = element_text(angle = 90)) +
ylab("log2(Abundance)") +
ggtitle("Boxplot of compound abundances\nAll samples\nFA-only / Cells / Negative Mode")
fa.cell.neg.noNA.gathered %>%
ggplot(aes(y = Samples, x = Abundance, fill = Group)) +
geom_density_ridges(scale = 15) +
theme_ridges() +
scale_y_discrete(expand = c(0.01, 0)) +
ggtitle("Ridge plot of compound abundances\nAll samples\nFA-only / Cells / Negative Mode")
Picking joint bandwidth of 0.997
fa.cell.neg.noNA.gathered %>%
filter(Group == "sample") %>%
ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) +
geom_density_ridges(scale = 10) +
theme_ridges() +
scale_y_discrete(expand = c(0.01, 0)) +
ggtitle("Ridge plot of compound abundances\nExperimental samples only\nFA-only / Cells / Negative Mode")
Picking joint bandwidth of 0.931
fa.cell.neg.noNA.gathered %>%
filter(Group == "sample") %>%
ggplot(aes(Abundance, group = Samples, color = Treatment)) +
geom_density(alpha = 0.8, size = 0.75) +
xlab("log2(Abundance)") +
ggtitle("Density plot of compound abundances\nExperimental samples only\nFA-only / Cells / Negative Mode")
fa.cell.pos.noNA.gathered <- fa.cell.pos.noNA %>%
gather(
key = "Metabolite", "Abundance",
which(colnames(fa.cell.pos.noNA) == "FApC1"):ncol(fa.cell.pos.noNA)
)
fa.cell.pos.noNA.gathered %>%
ggplot(aes(Samples, Abundance, fill = Group)) +
geom_boxplot() +
geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
theme(axis.text.x = element_text(angle = 90)) +
ylab("log2(Abundance)") +
ggtitle("Boxplot of compound abundances\nAll samples\nFA-only / Cells / Positive Mode")
fa.cell.pos.noNA.gathered %>%
ggplot(aes(y = Samples, x = Abundance, fill = Group)) +
geom_density_ridges(scale = 15) +
theme_ridges() +
scale_y_discrete(expand = c(0.01, 0)) +
ggtitle("Ridge plot of compound abundances\nAll samples\nFA-only / Cells / Positive Mode")
Picking joint bandwidth of 1.03
fa.cell.pos.noNA.gathered %>%
filter(Group == "sample") %>%
ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) +
geom_density_ridges(scale = 10) +
theme_ridges() +
scale_y_discrete(expand = c(0.01, 0)) +
ggtitle("Ridge plot of compound abundances\nExperimental samples only\nFA-only / Cells / Positive Mode")
Picking joint bandwidth of 1.03
fa.cell.pos.noNA.gathered %>%
filter(Group == "sample") %>%
ggplot(aes(Abundance, group = Samples, color = Treatment)) +
geom_density(alpha = 0.8, size = 0.75) +
xlab("log2(Abundance)") +
ggtitle("Density plot of compound abundances\nExperimental samples only\nFA-only / Cells / Positive Mode")
fa.med.neg.noNA.gathered <- fa.med.neg.noNA %>%
gather(
key = "Metabolite", "Abundance",
which(colnames(fa.med.neg.noNA) == "FAnM1"):ncol(fa.med.neg.noNA)
)
fa.med.neg.noNA.gathered %>%
ggplot(aes(Samples, Abundance, fill = Group)) +
geom_boxplot() +
geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
theme(axis.text.x = element_text(angle = 90)) +
ylab("log2(Abundance)") +
ggtitle("Boxplot of compound abundances\nAll samples\nFA-only / Media / Negative Mode")
fa.med.neg.noNA.gathered %>%
ggplot(aes(y = Samples, x = Abundance, fill = Group)) +
geom_density_ridges(scale = 15) +
theme_ridges() +
scale_y_discrete(expand = c(0.01, 0)) +
ggtitle("Ridge plot of compound abundances\nAll samples\nFA-only / Media / Negative Mode")
Picking joint bandwidth of 0.872
fa.med.neg.noNA.gathered %>%
filter(Group == "sample") %>%
ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) +
geom_density_ridges(scale = 10) +
theme_ridges() +
scale_y_discrete(expand = c(0.01, 0)) +
ggtitle("Ridge plot of compound abundances\nExperimental samples only\nFA-only / Media / Negative Mode")
Picking joint bandwidth of 0.866
fa.med.neg.noNA.gathered %>%
filter(Group == "sample") %>%
ggplot(aes(Abundance, group = Samples, color = Treatment)) +
geom_density(alpha = 0.8, size = 0.75) +
xlab("log2(Abundance)") +
ggtitle("Density plot of compound abundances\nExperimental samples only\nFA-only / Media / Negative Mode")
fa.med.pos.noNA.gathered <- fa.med.pos.noNA %>%
gather(
key = "Metabolite", "Abundance",
which(colnames(fa.med.pos.noNA) == "FApM1"):ncol(fa.med.pos.noNA)
)
fa.med.pos.noNA.gathered %>%
ggplot(aes(Samples, Abundance, fill = Group)) +
geom_boxplot() +
geom_boxplot(aes(color = Group), fatten = NULL, fill = NA, coef = 0, outlier.alpha = 0, show.legend = FALSE) +
theme(axis.text.x = element_text(angle = 90)) +
ylab("log2(Abundance)") +
ggtitle("Boxplot of compound abundances\nAll samples\nFA-only / Media / Postive Mode")
fa.med.pos.noNA.gathered %>%
ggplot(aes(y = Samples, x = Abundance, fill = Group)) +
geom_density_ridges(scale = 15) +
theme_ridges() +
scale_y_discrete(expand = c(0.01, 0)) +
ggtitle("Ridge plot of compound abundances\nAll samples\nFA-only / Media / Positive Mode")
Picking joint bandwidth of 1.36
fa.med.pos.noNA.gathered %>%
filter(Group == "sample") %>%
ggplot(aes(y = Samples, x = Abundance, fill = Treatment)) +
geom_density_ridges(scale = 10) +
theme_ridges() +
scale_y_discrete(expand = c(0.01, 0)) +
ggtitle("Ridge plot of compound abundances\nExperimental samples only\nFA-only / Media / Positive Mode")
Picking joint bandwidth of 1.34
fa.med.pos.noNA.gathered %>%
filter(Group == "sample") %>%
ggplot(aes(Abundance, group = Samples, color = Treatment)) +
geom_density(alpha = 0.8, size = 0.75) +
xlab("log2(Abundance)") +
ggtitle("Density plot of compound abundances\nExperimental samples only\nFA-only / Media / Positive Mode")
Some plots to understand the relationship between the samples, QC samples, solvent, and empty samples in some cases.
### PCA on all Samples ###
fa.cell.neg.full.pca <- fa.cell.neg.noNA %>%
select(starts_with("FAnC")) %>%
# good idea to center data before pca, but scaling should not be necessary
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
# plot variance explained by each new principal component
plot(
(fa.cell.neg.full.pca$sdev ^ 2) * 100 / sum(fa.cell.neg.full.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nAll samples only\nFA-only / Cells / Negative Mode",
type = "b"
)
fa.cell.neg.full.pca.x <- as.data.frame(fa.cell.neg.full.pca$x)
row.names(fa.cell.neg.full.pca.x) <- fa.cell.neg.noNA$Samples
fa.cell.neg.full.pca.x <- fa.cell.neg.full.pca.x %>%
bind_cols(fa.cell.neg.noNA %>% select(Group:Experiment))
fa.cell.neg.full.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Group)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (92.3% Var)") +
ylab("PC2 (3.7%)") +
ggtitle("Principal Component Analysis\nAll Samples\nFA-only / Cells / Negative Mode")
### Samples and QC ###
fa.cell.neg.smpl.QC.pca <- fa.cell.neg.noNA %>%
filter(Group == "sample" | Group == "QC") %>%
select(starts_with("FAnC")) %>%
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
plot(
(fa.cell.neg.smpl.QC.pca$sdev ^ 2) * 100 / sum(fa.cell.neg.smpl.QC.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nSamples and QC\nFA-only / Cells / Negative Mode",
type = "b"
)
fa.cell.neg.smpl.QC.pca.x <- as.data.frame(fa.cell.neg.smpl.QC.pca$x)
fa.cell.neg.smpl.QC.pca.x <- fa.cell.neg.smpl.QC.pca.x %>%
bind_cols(
fa.cell.neg.noNA %>%
filter(Group == "sample" | Group == "QC") %>%
select(Samples, Group:Experiment)
)
row.names(fa.cell.neg.smpl.QC.pca.x) <- fa.cell.neg.smpl.QC.pca.x$Samples
fa.cell.neg.smpl.QC.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (36.2% Var)") +
ylab("PC2 (17.0%)") +
ggtitle("Principal Component Analysis\nSamples and QC\nFA-only / Cells / Negative Mode")
fa.cell.neg.smpl.QC.pca.x %>%
ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC3 (11.9% Var)") +
ylab("PC4 (7.9%)") +
ggtitle("Principal Component Analysis\nSamples and QC\nFA-only / Cells / Negative Mode")
### Experimental Samples Only ###
fa.cell.neg.smpl.pca <- fa.cell.neg.noNA %>%
filter(Group == "sample") %>%
select(starts_with("FAnC")) %>%
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
plot(
(fa.cell.neg.smpl.pca$sdev ^ 2) * 100 / sum(fa.cell.neg.smpl.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nExperimental samples only\nFA-only / Cells / Negative Mode",
type = "b"
)
fa.cell.neg.smpl.pca.x <- as.data.frame(fa.cell.neg.smpl.pca$x)
fa.cell.neg.smpl.pca.x <- fa.cell.neg.smpl.pca.x %>%
bind_cols(
fa.cell.neg.noNA %>%
filter(Group == "sample") %>%
select(Samples, Group:Experiment)
)
row.names(fa.cell.neg.smpl.pca.x) <- fa.cell.neg.smpl.pca.x$Samples
fa.cell.neg.smpl.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (34.7% Var)") +
ylab("PC2 (20.3%)") +
ggtitle("Principal Component Analysis\nExperimental samples only\nFA-only / Cells / Negative Mode")
fa.cell.neg.smpl.pca.x %>%
ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC3 (11.5% Var)") +
ylab("PC4 (8.6%)") +
ggtitle("Principal Component Analysis\nExperimental samples only\nFA-only / Cells / Negative Mode")
### PCA on all Samples ###
fa.cell.pos.full.pca <- fa.cell.pos.noNA %>%
select(starts_with("FApC")) %>%
# good idea to center data before pca, but scaling should not be necessary
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
# plot variance explained by each new principal component
plot(
(fa.cell.pos.full.pca$sdev ^ 2) * 100 / sum(fa.cell.pos.full.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nAll samples only\nFA-only / Cells / Positive Mode",
type = "b"
)
fa.cell.pos.full.pca.x <- as.data.frame(fa.cell.pos.full.pca$x)
row.names(fa.cell.pos.full.pca.x) <- fa.cell.pos.noNA$Samples
fa.cell.pos.full.pca.x <- fa.cell.pos.full.pca.x %>%
bind_cols(fa.cell.pos.noNA %>% select(Group:Experiment))
fa.cell.pos.full.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Group)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (89.5% Var)") +
ylab("PC2 (3.4%)") +
ggtitle("Principal Component Analysis\nAll Samples\nFA-only / Cells / Positive Mode")
### Samples and QC ###
fa.cell.pos.smpl.QC.pca <- fa.cell.pos.noNA %>%
filter(Group == "sample" | Group == "QC") %>%
select(starts_with("FApC")) %>%
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
plot(
(fa.cell.pos.smpl.QC.pca$sdev ^ 2) * 100 / sum(fa.cell.pos.smpl.QC.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nSamples and QC\nFA-only / Cells / Positive Mode",
type = "b"
)
fa.cell.pos.smpl.QC.pca.x <- as.data.frame(fa.cell.pos.smpl.QC.pca$x)
fa.cell.pos.smpl.QC.pca.x <- fa.cell.pos.smpl.QC.pca.x %>%
bind_cols(
fa.cell.pos.noNA %>%
filter(Group == "sample" | Group == "QC") %>%
select(Samples, Group:Experiment)
)
row.names(fa.cell.pos.smpl.QC.pca.x) <- fa.cell.pos.smpl.QC.pca.x$Samples
fa.cell.pos.smpl.QC.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (48.3% Var)") +
ylab("PC2 (12.9%)") +
ggtitle("Principal Component Analysis\nSamples and QC\nFA-only / Cells / Positive Mode")
fa.cell.pos.smpl.QC.pca.x %>%
ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC3 (12.3% Var)") +
ylab("PC4 (8.4%)") +
ggtitle("Principal Component Analysis\nSamples and QC\nFA-only / Cells / Positive Mode")
### Experimental Samples Only ###
fa.cell.pos.smpl.pca <- fa.cell.pos.noNA %>%
filter(Group == "sample") %>%
select(starts_with("FApC")) %>%
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
plot(
(fa.cell.pos.smpl.pca$sdev ^ 2) * 100 / sum(fa.cell.pos.smpl.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nExperimental samples only\nFA-only / Cells / Positive Mode",
type = "b"
)
fa.cell.pos.smpl.pca.x <- as.data.frame(fa.cell.pos.smpl.pca$x)
fa.cell.pos.smpl.pca.x <- fa.cell.pos.smpl.pca.x %>%
bind_cols(
fa.cell.pos.noNA %>%
filter(Group == "sample") %>%
select(Samples, Group:Experiment)
)
row.names(fa.cell.pos.smpl.pca.x) <- fa.cell.pos.smpl.pca.x$Samples
fa.cell.pos.smpl.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (51.3% Var)") +
ylab("PC2 (15.7%)") +
ggtitle("Principal Component Analysis\nExperimental samples only\nFA-only / Cells / Positive Mode")
fa.cell.pos.smpl.pca.x %>%
ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC3 (11.1% Var)") +
ylab("PC4 (6.8%)") +
ggtitle("Principal Component Analysis\nExperimental samples only\nFA-only / Cells / Positive Mode")
fa.med.neg.full.pca <- fa.med.neg.noNA %>%
select(starts_with("FAnM")) %>%
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
plot(
(fa.med.neg.full.pca$sdev ^ 2) * 100 / sum(fa.med.neg.full.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nAll samples only\nFA-only / Media / Negative Mode",
type = "b"
)
fa.med.neg.full.pca.x <- as.data.frame(fa.med.neg.full.pca$x)
row.names(fa.med.neg.full.pca.x) <- fa.med.neg.noNA$Samples
fa.med.neg.full.pca.x <- fa.med.neg.full.pca.x %>%
bind_cols(fa.med.neg.noNA %>% select(Group:Experiment))
fa.med.neg.full.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Group)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (94.0% Var)") +
ylab("PC2 (2.2%)") +
ggtitle("Principal Component Analysis\nAll Samples\nFA-only / Media / Negative Mode")
### Samples and QC ###
fa.med.neg.smpl.QC.pca <- fa.med.neg.noNA %>%
filter(Group == "sample" | Group == "QC") %>%
select(starts_with("FAnM")) %>%
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
plot(
(fa.med.neg.smpl.QC.pca$sdev ^ 2) * 100 / sum(fa.med.neg.smpl.QC.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nSamples and QC\nFA-only / Media / Negative Mode",
type = "b"
)
fa.med.neg.smpl.QC.pca.x <- as.data.frame(fa.med.neg.smpl.QC.pca$x)
fa.med.neg.smpl.QC.pca.x <- fa.med.neg.smpl.QC.pca.x %>%
bind_cols(
fa.med.neg.noNA %>%
filter(Group == "sample" | Group == "QC") %>%
select(Samples, Group:Experiment)
)
row.names(fa.med.neg.smpl.QC.pca.x) <- fa.med.neg.smpl.QC.pca.x$Samples
fa.med.neg.smpl.QC.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (52.3% Var)") +
ylab("PC2 (25.9%)") +
ggtitle("Principal Component Analysis\nSamples and QC\nFA-only / Media / Negative Mode")
fa.med.neg.smpl.QC.pca.x %>%
ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC3 (11.9% Var)") +
ylab("PC4 (2.8%)") +
ggtitle("Principal Component Analysis\nSamples and QC\nFA-only / Media / Negative Mode")
### Experimental Samples Only ###
fa.med.neg.smpl.pca <- fa.med.neg.noNA %>%
filter(Group == "sample") %>%
select(starts_with("FAnM")) %>%
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
plot(
(fa.med.neg.smpl.pca$sdev ^ 2) * 100 / sum(fa.med.neg.smpl.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nExperimental samples only\nFA-only / Media / Negative Mode",
type = "b"
)
fa.med.neg.smpl.pca.x <- as.data.frame(fa.med.neg.smpl.pca$x)
fa.med.neg.smpl.pca.x <- fa.med.neg.smpl.pca.x %>%
bind_cols(
fa.med.neg.noNA %>%
filter(Group == "sample") %>%
select(Samples, Group:Experiment)
)
row.names(fa.med.neg.smpl.pca.x) <- fa.med.neg.smpl.pca.x$Samples
fa.med.neg.smpl.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (51.2% Var)") +
ylab("PC2 (26.2%)") +
ggtitle("Principal Component Analysis\nExperimental samples only\nFA-only / Media / Negative Mode")
fa.med.neg.smpl.pca.x %>%
ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC3 (13.4% Var)") +
ylab("PC4 (3.1%)") +
ggtitle("Principal Component Analysis\nExperimental samples only\nFA-only / Media / Negative Mode")
fa.med.pos.full.pca <- fa.med.pos.noNA %>%
select(starts_with("FApM")) %>%
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
plot(
(fa.med.pos.full.pca$sdev ^ 2) * 100 / sum(fa.med.pos.full.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nAll samples only\nFA-only / Media / Positive Mode",
type = "b"
)
fa.med.pos.full.pca.x <- as.data.frame(fa.med.pos.full.pca$x)
row.names(fa.med.pos.full.pca.x) <- fa.med.pos.noNA$Samples
fa.med.pos.full.pca.x <- fa.med.pos.full.pca.x %>%
bind_cols(fa.med.pos.noNA %>% select(Group:Experiment))
fa.med.pos.full.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Group)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (87.9% Var)") +
ylab("PC2 (6.0%)") +
ggtitle("Principal Component Analysis\nAll Samples\nFA-only / Media / Positive Mode")
### Samples and QC ###
fa.med.pos.smpl.QC.pca <- fa.med.pos.noNA %>%
filter(Group == "sample" | Group == "QC") %>%
select(starts_with("FApM")) %>%
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
plot(
(fa.med.pos.smpl.QC.pca$sdev ^ 2) * 100 / sum(fa.med.pos.smpl.QC.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nSamples and QC\nFA-only / Media / Positive Mode",
type = "b"
)
fa.med.pos.smpl.QC.pca.x <- as.data.frame(fa.med.pos.smpl.QC.pca$x)
fa.med.pos.smpl.QC.pca.x <- fa.med.pos.smpl.QC.pca.x %>%
bind_cols(
fa.med.pos.noNA %>%
filter(Group == "sample" | Group == "QC") %>%
select(Samples, Group:Experiment)
)
row.names(fa.med.pos.smpl.QC.pca.x) <- fa.med.pos.smpl.QC.pca.x$Samples
fa.med.pos.smpl.QC.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (62.02% Var)") +
ylab("PC2 (11.2%)") +
ggtitle("Principal Component Analysis\nSamples and QC\nFA-only / Media / Positive Mode")
fa.med.pos.smpl.QC.pca.x %>%
ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC3 (8.5% Var)") +
ylab("PC4 (4.4%)") +
ggtitle("Principal Component Analysis\nSamples and QC\nFA-only / Media / Positive Mode")
### Experimental Samples Only ###
fa.med.pos.smpl.pca <- fa.med.pos.noNA %>%
filter(Group == "sample") %>%
select(starts_with("FApM")) %>%
mutate_all(scale, center = TRUE, scale = FALSE) %>%
as.matrix() %>%
prcomp()
plot(
(fa.med.pos.smpl.pca$sdev ^ 2) * 100 / sum(fa.med.pos.smpl.pca$sdev ^ 2),
xlab = "Principal Component",
ylab = "Variance Explained",
main = "Percent variance explained by each principal component\nExperimental samples only\nFA-only / Media / Positive Mode",
type = "b"
)
fa.med.pos.smpl.pca.x <- as.data.frame(fa.med.pos.smpl.pca$x)
fa.med.pos.smpl.pca.x <- fa.med.pos.smpl.pca.x %>%
bind_cols(
fa.med.pos.noNA %>%
filter(Group == "sample") %>%
select(Samples, Group:Experiment)
)
row.names(fa.med.pos.smpl.pca.x) <- fa.med.pos.smpl.pca.x$Samples
fa.med.pos.smpl.pca.x %>%
ggplot(aes(x = PC1, y = PC2, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC1 (55.3% Var)") +
ylab("PC2 (14.8%)") +
ggtitle("Principal Component Analysis\nExperimental samples only\nFA-only / Media / Positive Mode")
fa.med.pos.smpl.pca.x %>%
ggplot(aes(x = PC3, y = PC4, color = Treatment)) +
geom_point(size = 4, alpha = 0.8) +
xlab("PC3 (10.1% Var)") +
ylab("PC4 (5.1%)") +
ggtitle("Principal Component Analysis\nExperimental samples only\nFA-only / Media / Positive Mode")
# sample prep
fa.cell.neg.smpl.data <- fa.cell.neg.noNA %>%
filter(Group == "sample")
fa.cell.neg.data.for.sva <- as.matrix(
fa.cell.neg.smpl.data[, which(
colnames(fa.cell.neg.smpl.data) == "FAnC1"
):ncol(fa.cell.neg.smpl.data)]
)
row.names(fa.cell.neg.data.for.sva) <- fa.cell.neg.smpl.data$Samples
fa.cell.neg.data.for.sva <- t(fa.cell.neg.data.for.sva)
# pheno prep
fa.cell.neg.data.pheno <- as.data.frame(fa.cell.neg.smpl.data[, 5:7])
row.names(fa.cell.neg.data.pheno) <- fa.cell.neg.smpl.data$Samples
# sva calculation
fa.cell.neg.mod.fa <- model.matrix(~ as.factor(Treatment), data = fa.cell.neg.data.pheno)
fa.cell.neg.mod0 <- model.matrix(~ 1, data = fa.cell.neg.data.pheno)
set.seed(2018)
fa.cell.neg.sv <- sva(fa.cell.neg.data.for.sva, fa.cell.neg.mod.fa, fa.cell.neg.mod0)
Number of significant surrogate variables is: 4
Iteration (out of 5 ):1 2 3 4 5
# extract the surrogate variables
fa.cell.neg.surr.var <- as.data.frame(fa.cell.neg.sv$sv)
colnames(fa.cell.neg.surr.var) <- c("S1", "S2", "S3", "S4")
colnames(fa.cell.neg.mod.fa) <- c("cntrl", "FAvsCNTRL")
# combine the full model matrix and the surrogate variables into one
fa.cell.neg.d.sv <- cbind(fa.cell.neg.mod.fa, fa.cell.neg.surr.var)
head(fa.cell.neg.d.sv)
cntrl FAvsCNTRL S1 S2 S3
T1_P1_C1 1 0 -0.34666196 -0.098939211 0.10948600
T1_P1_C2 1 0 -0.25299878 0.153661274 -0.04195281
T1_P1_C3 1 0 -0.21635550 0.061003369 -0.04414992
T1_P1_FA1 1 1 -0.13405912 0.409568228 0.26852885
T1_P1_FA2 1 1 -0.29004084 0.001577421 0.02797650
T1_P1_FA3 1 1 -0.07911872 0.021463180 -0.30016937
S4
T1_P1_C1 0.406340329
T1_P1_C2 -0.026472150
T1_P1_C3 0.197947488
T1_P1_FA1 -0.206850992
T1_P1_FA2 0.005364182
T1_P1_FA3 -0.017343664
fa.cell.neg.top.table <- fa.cell.neg.data.for.sva %>%
# fit a linear model
lmFit(fa.cell.neg.d.sv) %>%
# calculate the test statistics
eBayes() %>%
# select the top features that have a p-value < 0.05 after Bonferroni multiple hypothesis correction
topTable(coef = "FAvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(fa.cell.neg.data.for.sva))
# what the result looks like:
head(fa.cell.neg.top.table)
logFC AveExpr t P.Value adj.P.Val B
FAnC184 1.6081748 10.89884 9.861859 3.397357e-09 6.930608e-07 10.984129
FAnC186 1.2606197 11.14052 9.512399 6.244187e-09 1.273814e-06 10.359965
FAnC182 1.0904702 14.18294 6.665126 1.589223e-06 3.242015e-04 4.686504
FAnC17 -0.4425792 20.59570 -5.050769 5.832247e-05 1.189778e-02 1.025729
# make result more informative
fa.cell.neg.top.w.info <- fa.cell.neg.top.table %>%
rownames_to_column("compound_short") %>%
mutate(
fa_div_cntrl = 2 ^ logFC,
change_in_fa = ifelse(fa_div_cntrl < 1, "down", "up")
) %>%
inner_join(fa.cell.neg.compound.info, by = "compound_short") %>%
filter(fa_div_cntrl > 1.2 | fa_div_cntrl < 0.83) %>%
arrange(change_in_fa, desc(fa_div_cntrl))
fa.cell.neg.top.w.info %>%
select(compound_short, compound_full, change_in_fa, fa_div_cntrl)
compound_short compound_full change_in_fa fa_div_cntrl
1 FAnC17 Serine down 0.735818
2 FAnC184 5-Methyl-DHF up 3.048659
3 FAnC186 10-Formyl-DHF up 2.395986
4 FAnC182 Folic Acid up 2.129434
#write_csv(path = "./results/fa_cell_neg_top_hits_w_FC_pval.csv", fa.cell.neg.top.w.info)
fa.cell.neg.gathered <- fa.cell.neg.noNA %>%
filter(Group == "sample") %>%
bind_cols(fa.cell.neg.surr.var) %>%
select(Samples, Treatment, S1:S4, starts_with("FAnC")) %>%
gather(key = "Compound", value = "Abundance", FAnC1:FAnC99)
# structure so far:
glimpse(fa.cell.neg.gathered)
Observations: 4,896
Variables: 8
$ Samples <chr> "T1_P1_C1", "T1_P1_C2", "T1_P1_C3", "T1_P1_FA1", "T1...
$ Treatment <chr> "cntrl", "cntrl", "cntrl", "fa", "fa", "fa", "cntrl"...
$ S1 <dbl> -0.34666196, -0.25299878, -0.21635550, -0.13405912, ...
$ S2 <dbl> -0.098939211, 0.153661274, 0.061003369, 0.409568228,...
$ S3 <dbl> 0.10948600, -0.04195281, -0.04414992, 0.26852885, 0....
$ S4 <dbl> 0.4063403290, -0.0264721499, 0.1979474876, -0.206850...
$ Compound <chr> "FAnC1", "FAnC1", "FAnC1", "FAnC1", "FAnC1", "FAnC1"...
$ Abundance <dbl> 17.13956, 17.03877, 16.57158, 17.49617, 16.35237, 16...
fa.cell.neg.nested <- fa.cell.neg.gathered %>%
group_by(Compound) %>%
nest() %>%
# apply a linear model to each individual compound, as a function of the surrogate variables
mutate(model = map(data, ~lm(Abundance ~ S1 + S2 + S3 + S4, data = .))) %>%
# use broom to tidy up the output
mutate(augment_model = map(model, augment))
# result to far:
fa.cell.neg.nested
# A tibble: 204 x 4
Compound data model augment_model
<chr> <list> <list> <list>
1 FAnC1 <tibble [24 x 7]> <S3: lm> <tibble [24 x 12]>
2 FAnC10 <tibble [24 x 7]> <S3: lm> <tibble [24 x 12]>
3 FAnC100 <tibble [24 x 7]> <S3: lm> <tibble [24 x 12]>
4 FAnC101 <tibble [24 x 7]> <S3: lm> <tibble [24 x 12]>
5 FAnC102 <tibble [24 x 7]> <S3: lm> <tibble [24 x 12]>
6 FAnC103 <tibble [24 x 7]> <S3: lm> <tibble [24 x 12]>
7 FAnC104 <tibble [24 x 7]> <S3: lm> <tibble [24 x 12]>
8 FAnC105 <tibble [24 x 7]> <S3: lm> <tibble [24 x 12]>
9 FAnC106 <tibble [24 x 7]> <S3: lm> <tibble [24 x 12]>
10 FAnC107 <tibble [24 x 7]> <S3: lm> <tibble [24 x 12]>
# ... with 194 more rows
# now to get the residuals out for each compound
fa.cell.neg.modSV.resid <- fa.cell.neg.nested %>%
unnest(data, augment_model) %>%
select(Samples, Treatment, Compound, .resid) %>%
# return to long format
spread(Compound, .resid)
glimpse(fa.cell.neg.modSV.resid[, 1:5])
Observations: 24
Variables: 5
$ Samples <chr> "T1_P1_C1", "T1_P1_C2", "T1_P1_C3", "T1_P1_FA1", "T1...
$ Treatment <chr> "cntrl", "cntrl", "cntrl", "fa", "fa", "fa", "cntrl"...
$ FAnC1 <dbl> -0.067354144, 0.051006315, -0.265423042, 0.337389264...
$ FAnC10 <dbl> -0.122427613, 0.066678368, -0.363314938, 0.385126292...
$ FAnC100 <dbl> 0.0614190339, 0.0156849534, 0.0265461444, -0.0969954...
fa.cell.neg.modSV.resid %>%
select(Samples, one_of(fa.cell.neg.top.w.info$compound_short)) %>%
HeatmapPrepAlt("FAnC") %>%
t() %>%
heatmaply(
colors = viridis(n = 10, option = "magma"),
xlab = "Samples", ylab = "Compounds",
main = "Statistically significant compounds\nFA-Only Experiment / Cells / Negative Mode",
margins = c(50, 50, 75, 30),
labRow = fa.cell.neg.top.w.info$compound_full,
k_col = 2, k_row = 2
)
fa.cell.pos.smpl.data <- fa.cell.pos.noNA %>%
filter(Group == "sample")
fa.cell.pos.data.for.sva <- as.matrix(
fa.cell.pos.smpl.data[, which(
colnames(fa.cell.pos.smpl.data) == "FApC1"
):ncol(fa.cell.pos.smpl.data)]
)
row.names(fa.cell.pos.data.for.sva) <- fa.cell.pos.smpl.data$Samples
fa.cell.pos.data.for.sva <- t(fa.cell.pos.data.for.sva)
fa.cell.pos.data.pheno <- as.data.frame(fa.cell.pos.smpl.data[, 5:7])
row.names(fa.cell.pos.data.pheno) <- fa.cell.pos.smpl.data$Samples
fa.cell.pos.mod.fa <- model.matrix(~ as.factor(Treatment), data = fa.cell.pos.data.pheno)
fa.cell.pos.mod0 <- model.matrix(~ 1, data = fa.cell.pos.data.pheno)
set.seed(2018)
fa.cell.pos.sv <- sva(fa.cell.pos.data.for.sva, fa.cell.pos.mod.fa, fa.cell.pos.mod0)
Number of significant surrogate variables is: 3
Iteration (out of 5 ):1 2 3 4 5
fa.cell.pos.surr.var <- as.data.frame(fa.cell.pos.sv$sv)
colnames(fa.cell.pos.surr.var) <- c("S1", "S2", "S3")
colnames(fa.cell.pos.mod.fa) <- c("cntrl", "FAvsCNTRL")
fa.cell.pos.d.sv <- cbind(fa.cell.pos.mod.fa, fa.cell.pos.surr.var)
head(fa.cell.pos.d.sv)
cntrl FAvsCNTRL S1 S2 S3
T1_P1_C1 1 0 -0.31421328 -0.01485533 0.290220454
T1_P1_C2 1 0 -0.25463211 -0.16793123 -0.002767888
T1_P1_C3 1 0 -0.18110404 -0.07376907 -0.012989997
T1_P1_FA1 1 1 -0.21726705 -0.01420531 -0.345465446
T1_P1_FA2 1 1 -0.25160936 -0.10623941 0.069442593
T1_P1_FA3 1 1 -0.09146465 -0.05419054 -0.179265925
fa.cell.pos.top.table <- fa.cell.pos.data.for.sva %>%
lmFit(fa.cell.pos.d.sv) %>%
eBayes() %>%
topTable(coef = "FAvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(fa.cell.pos.data.for.sva))
fa.cell.pos.top.w.info <- fa.cell.pos.top.table %>%
rownames_to_column("compound_short") %>%
mutate(
fa_div_cntrl = 2 ^ logFC,
change_in_fa = ifelse(fa_div_cntrl < 1, "down", "up")
) %>%
inner_join(fa.cell.pos.compound.info, by = "compound_short") %>%
filter(fa_div_cntrl > 1.2 | fa_div_cntrl < 0.83) %>%
arrange(change_in_fa, desc(fa_div_cntrl))
fa.cell.pos.top.w.info %>%
select(compound_short, compound_full, change_in_fa, fa_div_cntrl)
compound_short compound_full change_in_fa fa_div_cntrl
1 FApC9 Serine down 0.7193909
2 FApC124 5-Methyl-THF up 1.6063569
#write_csv(path = "./results/fa_cell_pos_top_hits_w_FC_pval.csv", fa.cell.pos.top.w.info)
fa.cell.pos.gathered <- fa.cell.pos.noNA %>%
filter(Group == "sample") %>%
bind_cols(fa.cell.pos.surr.var) %>%
select(Samples, Treatment, S1:S3, starts_with("FApC")) %>%
gather(key = "Compound", value = "Abundance", FApC1:FApC99)
fa.cell.pos.nested <- fa.cell.pos.gathered %>%
group_by(Compound) %>%
nest() %>%
mutate(model = map(data, ~lm(Abundance ~ S1 + S2 + S3, data = .))) %>%
mutate(augment_model = map(model, augment))
fa.cell.pos.modSV.resid <- fa.cell.pos.nested %>%
unnest(data, augment_model) %>%
select(Samples, Treatment, Compound, .resid) %>%
spread(Compound, .resid)
fa.cell.pos.modSV.resid %>%
select(Samples, one_of(fa.cell.pos.top.w.info$compound_short)) %>%
HeatmapPrepAlt("FApC") %>%
t() %>%
heatmaply(
colors = viridis(n = 10, option = "magma"),
xlab = "Samples", ylab = "Compounds",
main = "Statistically significant compounds\nFA-Only Experiment / Cells / Positive Mode",
margins = c(50, 50, 75, 30),
labRow = fa.cell.pos.top.w.info$compound_full,
k_col = 2, k_row = 2
)
fa.med.neg.smpl.data <- fa.med.neg.noNA %>%
filter(Group == "sample")
fa.med.neg.data.for.sva <- as.matrix(
fa.med.neg.smpl.data[, which(
colnames(fa.med.neg.smpl.data) == "FAnM1"
):ncol(fa.med.neg.smpl.data)]
)
row.names(fa.med.neg.data.for.sva) <- fa.med.neg.smpl.data$Samples
fa.med.neg.data.for.sva <- t(fa.med.neg.data.for.sva)
fa.med.neg.data.pheno <- as.data.frame(fa.med.neg.smpl.data[, 5:7])
row.names(fa.med.neg.data.pheno) <- fa.med.neg.smpl.data$Samples
fa.med.neg.mod.fa <- model.matrix(~ as.factor(Treatment), data = fa.med.neg.data.pheno)
fa.med.neg.mod0 <- model.matrix(~ 1, data = fa.med.neg.data.pheno)
set.seed(2018)
fa.med.neg.sv <- sva(fa.med.neg.data.for.sva, fa.med.neg.mod.fa, fa.med.neg.mod0)
Number of significant surrogate variables is: 2
Iteration (out of 5 ):1 2 3 4 5
fa.med.neg.surr.var <- as.data.frame(fa.med.neg.sv$sv)
colnames(fa.med.neg.surr.var) <- c("S1", "S2")
colnames(fa.med.neg.mod.fa) <- c("cntrl", "FAvsCNTRL")
fa.med.neg.d.sv <- cbind(fa.med.neg.mod.fa, fa.med.neg.surr.var)
head(fa.med.neg.d.sv)
cntrl FAvsCNTRL S1 S2
T1_P1_C1 1 0 -0.09995191 -0.1919621
T1_P1_C2 1 0 -0.15623253 -0.1795639
T1_P1_C3 1 0 -0.27423439 0.2990248
T1_P1_FA1 1 1 -0.12610239 -0.1220748
T1_P1_FA2 1 1 -0.12812393 -0.2029587
T1_P1_FA3 1 1 -0.21880953 -0.1672745
fa.med.neg.top.table <- fa.med.neg.data.for.sva %>%
lmFit(fa.med.neg.d.sv) %>%
eBayes() %>%
topTable(coef = "FAvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(fa.med.neg.data.for.sva))
fa.med.neg.top.w.info <- fa.med.neg.top.table %>%
rownames_to_column("compound_short") %>%
mutate(
fa_div_cntrl = 2 ^ logFC,
change_in_fa = ifelse(fa_div_cntrl < 1, "down", "up")
) %>%
inner_join(fa.med.neg.compound.info, by = "compound_short") %>%
filter(fa_div_cntrl > 1.2 | fa_div_cntrl < 0.83) %>%
arrange(change_in_fa, desc(fa_div_cntrl))
fa.med.neg.top.w.info %>%
select(compound_short, compound_full, change_in_fa, fa_div_cntrl)
compound_short compound_full change_in_fa fa_div_cntrl
1 FAnM61 Dihydrofolic Acid up 6.618852
2 FAnM60 Folic Acid up 4.492412
#write_csv(path = "./results/fa_med_neg_top_hits_w_FC_pval.csv", fa.med.neg.top.w.info)
fa.med.neg.gathered <- fa.med.neg.noNA %>%
filter(Group == "sample") %>%
bind_cols(fa.med.neg.surr.var) %>%
select(Samples, Treatment, S1:S2, starts_with("FAnM")) %>%
gather(key = "Compound", value = "Abundance", FAnM1:FAnM8)
fa.med.neg.nested <- fa.med.neg.gathered %>%
group_by(Compound) %>%
nest() %>%
mutate(model = map(data, ~lm(Abundance ~ S1 + S2, data = .))) %>%
mutate(augment_model = map(model, augment))
fa.med.neg.modSV.resid <- fa.med.neg.nested %>%
unnest(data, augment_model) %>%
select(Samples, Treatment, Compound, .resid) %>%
spread(Compound, .resid)
glimpse(fa.med.neg.modSV.resid[, 1:5])
Observations: 24
Variables: 5
$ Samples <chr> "T1_P1_C1", "T1_P1_C2", "T1_P1_C3", "T1_P1_FA1", "T1...
$ Treatment <chr> "cntrl", "cntrl", "cntrl", "fa", "fa", "fa", "cntrl"...
$ FAnM1 <dbl> 0.0084053692, -0.0257309954, -0.0513950496, 0.006788...
$ FAnM10 <dbl> -0.002899808, 0.008644288, -0.039545617, -0.01113359...
$ FAnM12 <dbl> -0.027355739, 0.108203107, 0.133830462, -0.040861511...
fa.med.neg.modSV.resid %>%
select(Samples, one_of(fa.med.neg.top.w.info$compound_short)) %>%
HeatmapPrepAlt("FAnM") %>%
t() %>%
heatmaply(
colors = viridis(n = 10, option = "magma"),
xlab = "Samples", ylab = "Compounds",
main = "Statistically significant compounds\nFA-Only Experiment / Media / Negative Mode",
margins = c(50, 50, 75, 30),
labRow = fa.med.neg.top.w.info$compound_full,
k_col = 2, k_row = 2
)
fa.med.pos.smpl.data <- fa.med.pos.noNA %>%
filter(Group == "sample")
fa.med.pos.data.for.sva <- as.matrix(
fa.med.pos.smpl.data[, which(
colnames(fa.med.pos.smpl.data) == "FApM1"
):ncol(fa.med.pos.smpl.data)]
)
row.names(fa.med.pos.data.for.sva) <- fa.med.pos.smpl.data$Samples
fa.med.pos.data.for.sva <- t(fa.med.pos.data.for.sva)
fa.med.pos.data.pheno <- as.data.frame(fa.med.pos.smpl.data[, 5:7])
row.names(fa.med.pos.data.pheno) <- fa.med.pos.smpl.data$Samples
fa.med.pos.mod.fa <- model.matrix(~ as.factor(Treatment), data = fa.med.pos.data.pheno)
fa.med.pos.mod0 <- model.matrix(~ 1, data = fa.med.pos.data.pheno)
set.seed(2018)
fa.med.pos.sv <- sva(fa.med.pos.data.for.sva, fa.med.pos.mod.fa, fa.med.pos.mod0)
Number of significant surrogate variables is: 1
Iteration (out of 5 ):1 2 3 4 5
fa.med.pos.surr.var <- as.data.frame(fa.med.pos.sv$sv)
colnames(fa.med.pos.surr.var) <- c("S1")
colnames(fa.med.pos.mod.fa) <- c("cntrl", "FAvsCNTRL")
fa.med.pos.d.sv <- cbind(fa.med.pos.mod.fa, fa.med.pos.surr.var)
head(fa.med.pos.d.sv)
cntrl FAvsCNTRL S1
T1_P1_C1 1 0 -0.17743847
T1_P1_C2 1 0 -0.20473498
T1_P1_C3 1 0 -0.03404904
T1_P1_FA1 1 1 -0.20311683
T1_P1_FA2 1 1 -0.14711451
T1_P1_FA3 1 1 -0.16289143
fa.med.pos.top.table <- fa.med.pos.data.for.sva %>%
lmFit(fa.med.pos.d.sv) %>%
eBayes() %>%
topTable(coef = "FAvsCNTRL", adjust = "bonferroni", p.value = 0.05, n = nrow(fa.med.pos.data.for.sva))
fa.med.pos.top.w.info <- fa.med.pos.top.table %>%
rownames_to_column("compound_short") %>%
mutate(
fa_div_cntrl = 2 ^ logFC,
change_in_fa = ifelse(fa_div_cntrl < 1, "down", "up")
) %>%
inner_join(fa.med.pos.compound.info, by = "compound_short") %>%
filter(fa_div_cntrl > 1.2 | fa_div_cntrl < 0.83) %>%
arrange(change_in_fa, desc(fa_div_cntrl))
fa.med.pos.top.w.info %>%
select(compound_short, compound_full, change_in_fa, fa_div_cntrl)
compound_short compound_full change_in_fa fa_div_cntrl
1 FApM59 Folic Acid up 4.083554
#write_csv(path = "./results/fa_med_pos_top_hits_w_FC_pval.csv", fa.med.pos.top.w.info)
fa.med.pos.gathered <- fa.med.pos.noNA %>%
filter(Group == "sample") %>%
bind_cols(fa.med.pos.surr.var) %>%
select(Samples, Treatment, S1, starts_with("FApM")) %>%
gather(key = "Compound", value = "Abundance", FApM1:FApM9)
fa.med.pos.nested <- fa.med.pos.gathered %>%
group_by(Compound) %>%
nest() %>%
mutate(model = map(data, ~lm(Abundance ~ S1, data = .))) %>%
mutate(augment_model = map(model, augment))
fa.med.pos.modSV.resid <- fa.med.pos.nested %>%
unnest(data, augment_model) %>%
select(Samples, Treatment, Compound, .resid) %>%
spread(Compound, .resid)
# only one compound was statistically significant = just plot folic acid
fa.med.pos.modSV.resid %>%
select(Samples:Treatment, one_of(fa.med.pos.top.w.info$compound_short)) %>%
ggplot(aes(Treatment, FApM59, color = Treatment)) +
geom_boxplot() +
geom_jitter(size = 2) +
ylab("Folic Acid")
fa.full.hit.list <- fa.cell.neg.top.w.info %>%
mutate(Mode = "cell.neg") %>%
bind_rows(
fa.cell.pos.top.w.info %>%
mutate(Mode = "cell.pos")
) %>%
bind_rows(
fa.med.neg.top.w.info %>%
mutate(Mode = "med.neg")
) %>%
bind_rows(
fa.med.pos.top.w.info %>%
mutate(Mode = "med.pos")
) %>%
as.tibble()
fa.full.hit.list %>%
select(compound_full:formula, cas_id) %>%
distinct() %>%
arrange(compound_full)
# A tibble: 6 x 3
compound_full formula cas_id
<chr> <chr> <chr>
1 10-Formyl-DHF C20 H21 N7 O7 28459-40-7
2 5-Methyl-DHF C20 H23 N7 O6 59904-24-4
3 5-Methyl-THF C20 H25 N7 O6 134-35-0
4 Dihydrofolic Acid C19 H21 N7 O6 4033-27-6
5 Folic Acid C19 H19 N7 O6 59-30-3
6 Serine C3 H7 N O3 56-45-1
fa.sml.hit.list <- fa.full.hit.list %>%
select(compound_short, compound_full, Mode, cas_id)
#write_csv(path = "./results/fa_hit_assignment.csv", fa.sml.hit.list)
fa.plot.order <- read_csv("./data/pathways/fa_hit_assignment.csv")
Parsed with column specification:
cols(
compound_short = col_character(),
compound_full = col_character(),
Mode = col_character(),
cas_id = col_character(),
Source = col_character(),
Plot_Name = col_character()
)
### Cell Hits Plot ###
fa.cell.hits <- fa.plot.order %>%
filter(Source == "C") %>%
mutate(plot_order = factor(Plot_Name, levels = Plot_Name))
fa.cell.data <- fa.cell.neg.modSV.resid %>%
inner_join(fa.cell.pos.modSV.resid %>% select(-Treatment), by = "Samples") %>%
mutate_at(vars(matches("FA")), scale) %>%
gather(key = "compound_short", value = "Abundance", -Samples, -Treatment) %>%
inner_join(fa.cell.hits, by = "compound_short")
fa.cell.sig.plot <- fa.cell.data %>%
ggplot(aes(plot_order, Abundance, color = Treatment)) +
geom_boxplot(position = position_dodge(0.8)) +
geom_jitter(size = 2, alpha = 0.5, position = position_dodge(0.8)) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
panel.background = element_rect(fill = "gray90")
) +
xlab("") +
ylab("") +
scale_color_manual(values = c("#56B4E9", "#0072B2"), labels = c("Control", "FA")) +
ylim(-2.5, 2.5)
fa.cell.sig.plot
### Media Hits Plot ###
fa.med.hits <- fa.plot.order %>%
filter(Source == "M") %>%
mutate(plot_order = factor(Plot_Name, levels = Plot_Name))
fa.med.data <- fa.med.neg.modSV.resid %>%
inner_join(fa.med.pos.modSV.resid %>% select(-Treatment), by = "Samples") %>%
mutate_at(vars(matches("FA")), scale) %>%
gather(key = "compound_short", value = "Abundance", -Samples, -Treatment) %>%
inner_join(fa.med.hits, by = "compound_short")
fa.med.sig.plot <- fa.med.data %>%
ggplot(aes(plot_order, Abundance, color = Treatment)) +
geom_boxplot(position = position_dodge(0.8)) +
geom_jitter(size = 2, alpha = 0.5, position = position_dodge(0.8)) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
panel.background = element_rect(fill = "gray90"),
legend.justification = "top"
) +
xlab("") +
ylab("") +
scale_color_manual(values = c("#56B4E9", "#0072B2"), labels = c("Control", "FA")) +
ylim(-2.5, 2.5)
fa.med.sig.plot
### all together
fa.legend <- get_legend(fa.med.sig.plot)
fa.sig.plot.grid <- plot_grid(
fa.cell.sig.plot + theme(legend.position = "none", plot.margin = unit(c(0,6,0,0), "pt")),
plot_grid(
fa.med.sig.plot + theme(legend.position = "none", plot.margin = unit(c(0,6,0,0), "pt")),
fa.legend,
rel_widths = c(1.1, 0.9)
),
ncol = 1, labels = c("A", "B"),
rel_heights = c(1, 1)
)
fa.sig.plot.grid
#save_plot("./results/fa_only_exp_sig.png", fa.sig.plot.grid, base_width = 8, base_height = 8)
sessionInfo()
R version 3.5.1 (2018-07-02)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1
Matrix products: default
locale:
[1] LC_COLLATE=English_United States.1252
[2] LC_CTYPE=English_United States.1252
[3] LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C
[5] LC_TIME=English_United States.1252
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] bindrcpp_0.2.2 ggridges_0.5.1 broom_0.5.0
[4] limma_3.36.5 sva_3.28.0 BiocParallel_1.14.2
[7] genefilter_1.62.0 mgcv_1.8-25 nlme_3.1-137
[10] heatmaply_0.15.2 viridis_0.5.1 viridisLite_0.3.0
[13] plotly_4.8.0 GGally_1.4.0 cowplot_0.9.3
[16] forcats_0.3.0 stringr_1.3.1 dplyr_0.7.8
[19] purrr_0.2.5 readr_1.1.1 tidyr_0.8.2
[22] tibble_1.4.2 ggplot2_3.1.0 tidyverse_1.2.1
loaded via a namespace (and not attached):
[1] colorspace_1.3-2 class_7.3-14 modeltools_0.2-22
[4] mclust_5.4.1 rprojroot_1.3-2 rstudioapi_0.8
[7] flexmix_2.3-14 bit64_0.9-7 fansi_0.4.0
[10] AnnotationDbi_1.42.1 mvtnorm_1.0-8 lubridate_1.7.4
[13] xml2_1.2.0 splines_3.5.1 codetools_0.2-15
[16] robustbase_0.93-3 knitr_1.20 jsonlite_1.5
[19] annotate_1.58.0 cluster_2.0.7-1 kernlab_0.9-27
[22] shiny_1.2.0 compiler_3.5.1 httr_1.3.1
[25] backports_1.1.2 assertthat_0.2.0 Matrix_1.2-15
[28] lazyeval_0.2.1 cli_1.0.1 later_0.7.5
[31] htmltools_0.3.6 tools_3.5.1 gtable_0.2.0
[34] glue_1.3.0 reshape2_1.4.3 Rcpp_1.0.0
[37] Biobase_2.40.0 cellranger_1.1.0 trimcluster_0.1-2.1
[40] gdata_2.18.0 crosstalk_1.0.0 iterators_1.0.10
[43] fpc_2.1-11.1 rvest_0.3.2 mime_0.6
[46] gtools_3.8.1 XML_3.98-1.16 dendextend_1.9.0
[49] DEoptimR_1.0-8 MASS_7.3-51.1 scales_1.0.0
[52] TSP_1.1-6 promises_1.0.1 hms_0.4.2
[55] parallel_3.5.1 RColorBrewer_1.1-2 yaml_2.2.0
[58] memoise_1.1.0 gridExtra_2.3 reshape_0.8.8
[61] stringi_1.2.4 RSQLite_2.1.1 gclus_1.3.1
[64] S4Vectors_0.18.3 foreach_1.4.4 seriation_1.2-3
[67] caTools_1.17.1.1 BiocGenerics_0.26.0 matrixStats_0.54.0
[70] rlang_0.3.0.1 pkgconfig_2.0.2 prabclus_2.2-6
[73] bitops_1.0-6 evaluate_0.12 lattice_0.20-38
[76] bindr_0.1.1 labeling_0.3 htmlwidgets_1.3
[79] bit_1.1-14 tidyselect_0.2.5 plyr_1.8.4
[82] magrittr_1.5 R6_2.3.0 IRanges_2.14.12
[85] gplots_3.0.1 DBI_1.0.0 pillar_1.3.0
[88] haven_1.1.2 whisker_0.3-2 withr_2.1.2
[91] survival_2.43-1 RCurl_1.95-4.11 nnet_7.3-12
[94] modelr_0.1.2 crayon_1.3.4 utf8_1.1.4
[97] KernSmooth_2.23-15 rmarkdown_1.10 grid_3.5.1
[100] readxl_1.1.0 data.table_1.11.8 blob_1.1.1
[103] digest_0.6.18 diptest_0.75-7 webshot_0.5.1
[106] xtable_1.8-3 httpuv_1.4.5 stats4_3.5.1
[109] munsell_0.5.0 registry_0.5